Data Processing

Show code
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(tidycensus))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(pdftools))
suppressPackageStartupMessages(library(stringr))

suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(kableExtra))

suppressPackageStartupMessages(library(ggmap))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(usmap))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(RColorBrewer))

Data Access. ICD-10 mortality documentation, parts 4–6 of the ICD-10 mortality data set (covering 2013 onward, chosen to align with the years analyzed in the U.S. mortality data set), along with the corresponding country codes, country-year availability tables, reference population data, and live-birth counts, were downloaded from the WHO.

Summary. The mortality data set provides only country codes and ICD-10 mortality codes, without country names or cause-of-death descriptions. To make the data interpret-able, I first merged the data set with reference files containing country codes, country-year availability, population size, and live-birth counts. Additionally, since the ICD-10 Mortality Tabulation List is not available in a downloadable spreadsheet, I extracted the relevant tables directly from the ICD-10 mortality documentation (PDF) and constructed a mapping file that links ICD-10 codes to their corresponding causes of death. This mapping file was then merged with the mortality data set, enabling each record to be associated with a meaningful cause-of-death category.

Show code
idir                         <- "./data"
file_mort1                   <- "Morticd10_part4"
file_mort2                   <- "Morticd10_part5"
file_mort3                   <- "Morticd10_part6"
file_pop_livebirth           <- "pop"
file_country_code            <- "country_codes"
file_country_yr_availability <- "list_ctry_years_feb2025.xlsx"
file_documentation           <- "WHO_Mortality_Database_Documentation.pdf"

Processing County-Level Mortality, County-Level Population and Live Births, Country Code, and Country-Year Availability

Show code
# Country code, country-year availability
df_country_code <- fread(paste0(idir, "/", file_country_code))

df_country_yr_avability <- read_excel(paste0(idir, "/", file_country_yr_availability), 
                                      sheet = "avail_pop", skip = 7) %>% 
  select(-any_of(c("Admin1", "SubDiv", "Updates"))) %>%
  mutate(avail = 1)

# Population and live births by country
df_pop <- fread(paste0(idir, "/", file_pop_livebirth)) %>%
    filter(Year > 2013) %>%
    select(-c(Admin1, SubDiv, Frmat)) %>%
    pivot_longer(
      cols = starts_with("Pop"),
      names_to = "Age_group",
      values_to = "Pop_count"
    ) 

# Mortality
read_dta <- function(filename) {
  df <- fread(paste0(idir, "/", filename)) %>%
    filter(Year > 2013) %>%
    select(-c(Admin1, SubDiv, List, Frmat, IM_Frmat, starts_with("IM_Deaths"))) %>%
    pivot_longer(
      cols = starts_with("Deaths"),
      names_to = "Age_group",
      values_to = "Death_count"
    ) 
  return(df)
}

df_raw1 <- read_dta(file_mort1)
df_raw2 <- read_dta(file_mort2)
df_raw3 <- read_dta(file_mort3)

df_mortality <- rbind(df_raw1, df_raw2, df_raw3)
rm(df_raw1, df_raw2, df_raw3)

I first checked whether the country-code reference file and the country-year availability data set contained consistent country names and country codes. Unfortunately, there are six countries that lack country-year availability information. To understand whether these countries should still be included in the analysis, I examined whether they have corresponding mortality records in the WHO data set:

Show code
countries_no_availability <- anti_join(df_country_code %>% 
                                         rename(Country = country) %>% 
                                         distinct(), 
                                       df_country_yr_avability %>% 
                                         select(Country, name) %>% 
                                         distinct(), 
                                       by = c("Country", "name")) %>%
  filter(Country %in% df_mortality$Country) %>%
  rename(Code = Country, 
         Country = name)

tmp <- list()
for (i in seq_len(nrow(countries_no_availability))) {
  country <- countries_no_availability$Country[i]
  code    <- countries_no_availability$Code[i]
  
  total_deaths <- df_mortality %>%
    filter(Country == code) %>%
    pull(Death_count) %>%
    sum(na.rm = T)
  tmp[[i]] <- data.frame(Code = code, 
                         Country = country,
                         Total_Deaths = total_deaths, 
                         stringsAsFactors = FALSE) 
}
kable(do.call(rbind, tmp), align = c("l", "l", "l"))
Code Country Total_Deaths
1303 Mayotte 14630
2005 Anguilla 1972
2025 Aruba 19812
3405 United Arab Emirates 111762
4195 North Macedonia 793160
5108 Micronesia (Federated States of) 10684
Show code
rm(tmp, code, country, i, total_deaths)

Since several countries lacked availability information still had corresponding mortality records, I began by merging the mortality data set with the country-code reference file to attach readable country names to each numeric country code. Next, I merged the mortality data with the country–year availability data set to determine which countries had population and availability information. Any country that appeared in the mortality data set but not in the availability data set was retained in the final merged data frame, with the corresponding availability fields recorded as NA.

Show code
df <- inner_join(df_mortality, df_country_code, by=c("Country" = "country")) 
df <- left_join(df, df_country_yr_avability, by = c("Country", "name", "Year")) %>% 
  select(-Country) %>%
  rename(Country = name) %>%
  select(Country, everything()) %>%
  mutate(avail = replace_na(avail, 0))

count_sex <- table(df$Sex, useNA = "ifany")
kable(t(data.frame(
  Country = length(unique(df$Country)),
  Cause_of_Death = length(unique(df$Cause)),
  Years   = paste(sort(unique(df$Year)), collapse = ", "),
  Sex     = paste0("male: ",   
                   100 * round((count_sex["1"]) / nrow(df), 4), "%, ",
                   "female: ", 
                   100 *round((count_sex["2"]) / nrow(df), 4), "%, ",
                   "unknown ", 
                   100 * round((count_sex["9"]) / nrow(df), 4), "%"),
  stringsAsFactors = FALSE
  )), 
  align = c("l", "l"))
Country 134
Cause_of_Death 10880
Years 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023
Sex male: 52.09%, female: 47.47%, unknown 0.43%
Show code
rm(count_sex, df_country_code, df_country_yr_avability, df_mortality, df_pop, countries_no_availability)

I summarized the number of countries, the number of distinct causes of death, the range of years, and the sex distribution in the dataset, and found that it includes 134 countries, 10,880 cause entries, accross ten years of data (2014–2023), and a sex distribution of 52.05% male, 47.47% female, and 0.43% unknown. Among these, the 10,880 entries correspond to detailed list numbers from ICD-10, rather than the standardized 4-digit cause codes typically used for analysis. Because the goal is to classify mortality by the official ICD-10 four-digit codes and their corresponding cause categories, I will need to extract the ICD-10 Mortality Tabulation List to map detailed list numbers into meaningful cause groups.

ICD-10 Tabulation List

Processing ICD-10 Code Mapping

Since the ICD-10 Mortality Tabulation List is not available in a downloadable spreadsheet, I extracted the table located between pages 41–44 of the WHO ICD-10 mortality documentation. Specifically, I included only the content that appears below “Table 8. ICD-10 Mortality Tabulation List 1” and above “Table 9. ICD-10 Detailed Codes (3rd and 4th characters).” Next, I treated any line beginning with a four-digit WHO code as the start of a new record. I then trimmed leading and trailing whitespace, split each line into its component columns, and constructed a clean data frame from the resulting structured text. Within the resulting data frame, I removed the header row (“Code,” “Detailed List of Numbers,” and “Cause”), filtered out lines that were incorrectly captured as page numbers, collapsed multiple spaces into a single space, dropped rows that were broken across lines and manually reconstructed them, and standardized hyphens by replacing ” - ” with “-”. The resulting ICD 10 Mortality Tabulation List looks like:

Show code
txt <- pdf_text(file.path(idir, file_documentation))[41:44]

lines       <- unlist(strsplit(txt, "\n", fixed = TRUE))
start_idx   <- grep("^Table 8\\. ICD 10 Mortality Tabulation List 1", lines)
end_idx     <- grep("^Table 9\\. ICD 10 detailed codes \\(3rd and 4th characters\\)", lines)
table_lines <- lines[(start_idx + 1):(end_idx - 1)]

fixed_lines <- character(0)

for (ln in table_lines) {
  if (grepl("^\\s*\\d{4}\\b", ln)) {
    fixed_lines <- c(fixed_lines, ln)
  } else {
    if (length(fixed_lines) > 0) {
      fixed_lines[length(fixed_lines)] <- paste(
        fixed_lines[length(fixed_lines)],
        str_squish(ln)
      )}}}

fixed_lines_trim <- str_trim(fixed_lines)
split_mat <- str_split_fixed(fixed_lines_trim, "\\s{2,}", n = 3)

df_icd10 <- data.frame(
  Code   = split_mat[, 1],
  Code_range = split_mat[, 2],
  Cause = split_mat[, 3],
  stringsAsFactors = F)

df_icd10 <- df_icd10 %>%
  mutate(across(
    everything(),
    ~ str_replace_all(.x, "Code\\s+Detailed\\s+List\\s+Numbers\\s+Cau.*", ""))) %>%
  mutate(across(
    everything(),
    ~ str_replace(.x, "Whooping cough\\s+39\\b", "Whooping cough"))) %>%
  mutate(across(
    everything(),
    ~ str_replace(.x, "Leukaemia\\s+40\\b", "Leukaemia"))) %>%
  mutate(across(
    everything(),
    ~ str_replace(.x,
      "Diseases of the skin and subcutaneous tissue\\s+41\\b",
      "Diseases of the skin and subcutaneous tissue"))) %>%
  mutate(across(
    everything(),
    ~ str_replace(.x,
        "^Schistosomiasis\\s+.*$",
        "Schistosomiasis"))) %>%
  mutate(across(
    everything(),
    ~ str_replace_all(.x, "\\s{2,}", " "))) %>%
  mutate(across(
    where(is.character),
    ~ gsub(" - ", "-", .x))) %>%
  filter(!Code %in% c("1025", "1046", "1103"))

manual_1025 <- data.frame(
  Code = "1025",
  Code_range = "A21-A32, A38, A42-A49, A65-A79, A81, A83-A89, B00-B04, B06-B09, B25-B49, B58-B64, B66-B94, B99",
  Cause = "Remainder of certain infectious and parasitic diseases",
  stringsAsFactors = F)

manual_1046 <- data.frame(
  Code = "1046",
  Code_range =
    "C17, C23-C24, C26-C31, C37-C41, C44-C49, C51-C52, C57-C60, C62-C66, C68-C69, C73-C81, C88, C96-C97",
  Cause = "Remainder of malignant neoplasms",
  stringsAsFactors = F)

manual_1103 <- data.frame(
  Code = "1103",
  Code_range = "W20-W64, W75-W99, X10-X39, X50-X59, Y10-Y89",
  Cause = "All other external causes",
  stringsAsFactors = F)

df_icd10 <- bind_rows(df_icd10, manual_1025, manual_1046, manual_1103) %>%
  arrange(as.numeric(Code)) %>%
  mutate(across(everything(), ~ trimws(.)))  # optional cleanup of leading/trailing spaces

kable(head(df_icd10, 10))
Code Code_range Cause
1000 All causes
1001 A00-B99 Certain infectious and parasitic diseases
1002 A00 Cholera
1003 A09 Diarrhoea and gastroenteritis of presumed infectious origin
1004 A01-A08 Other intestinal infectious diseases
1005 A15-A16 Respiratory tuberculosis
1006 A17-A19 Other tuberculosis
1007 A20 Plague
1008 A33-A35 Tetanus
1009 A36 Diphtheria

Next, I wrote a function to expand ICD code ranges—for example, converting A00–A02, A04–A05 into the full set of codes (A00, A01, A02, A04, A05), and converted the data into a long-format mapping file, with each individual ICD-10 code represented as its own row:

Show code
expand_icd_range <- function(range) {
  # Split into start/end codes
  parts <- strsplit(range, "-")[[1]]
  start <- parts[1]
  end   <- parts[2]
  
  # Extract letter and numbers
  start_letter <- substr(start, 1, 1)
  end_letter   <- substr(end,   1, 1)
  start_num    <- as.integer(substr(start, 2, 3))
  end_num      <- as.integer(substr(end,   2, 3))
  
  letters_seq <- LETTERS[match(start_letter, LETTERS):match(end_letter, LETTERS)]
  
  out <- c()
  for (L in letters_seq) {
    if (L == start_letter && L == end_letter) {
      nums <- start_num:end_num
    } else if (L == start_letter) {
      nums <- start_num:99
    } else if (L == end_letter) {
      nums <- 0:end_num
    } else {
      nums <- 0:99
    }
    out <- c(out, sprintf("%s%02d", L, nums))
    }
  out
}


df_icd10_long <- df_icd10 %>%
  mutate(is_range = str_detect(Code_range, "^[A-Z][0-9]{2}-[A-Z][0-9]{2}$")) %>%
  rowwise() %>%
  mutate(
    Expanded_codes = list(
      if (is_range) {
        expand_icd_range(Code_range)
      } else {
        Code_range          # single code or label
      })) %>%
  ungroup() %>%
  unnest(Expanded_codes)

kable(head(df_icd10_long %>% select(Code, Code_range, Cause, Expanded_codes), 5))
Code Code_range Cause Expanded_codes
1000 All causes All causes
1001 A00-B99 Certain infectious and parasitic diseases A00
1001 A00-B99 Certain infectious and parasitic diseases A01
1001 A00-B99 Certain infectious and parasitic diseases A02
1001 A00-B99 Certain infectious and parasitic diseases A03

However, several codes appear more than once, meaning they map to multiple cause categories rather than having a one-to-one relationship:

Show code
summary_dup <- df_icd10_long %>%
  filter(Code != "1000") %>%  
  group_by(Expanded_codes) %>% 
  summarize(n = n(),   
            Codes = paste(unique(Code), collapse=", ")) %>%
  filter(n > 1) %>%   
  arrange(Expanded_codes)

kable(summary_dup %>% head(5))
Expanded_codes n Codes
A00 2 1001, 1002
A01 2 1001, 1004
A02 2 1001, 1004
A03 2 1001, 1004
A04 2 1001, 1004
Show code
print(paste0("Number of duplicated codes: ", nrow(summary_dup)))
[1] "Number of duplicated codes: 810"

I observed that this is because some ICD codes fall under both a detailed category and a broader overarching category. For example, cholera corresponds to the A00 (Code: 1002), but it is also included within the broader range A00–B99 (Code: 1001), which represents “Certain infectious and parasitic diseases.” To preserve this hierarchical structure, I mapped each ICD code to both its detailed code category (e.g., 1002, which corresponds to Cholera) and its broader category (e.g., 1001, which correspond to Certain infectious and parasitic diseases), such that:

Show code
df_icd10 <- df_icd10_long %>%
  group_by(Code) %>%
  mutate(code_count = n_distinct(Expanded_codes)) %>%
  ungroup() %>%
  arrange(Expanded_codes, code_count) %>% 
  group_by(Expanded_codes) %>%
  mutate(rank = row_number()) %>% 
  ungroup() %>%
  pivot_wider(
    id_cols   = Expanded_codes,
    names_from = rank,
    values_from = c(Code, Code_range, Cause),
    names_sep = "_") %>%
  rename(
    Code_detailed       = Code_1,
    Code_broader        = Code_2,
    Code_range_detailed = Code_range_1,
    Code_range_broader  = Code_range_2,
    Cause_detailed      = Cause_1,
    Cause_broader       = Cause_2)

example <- df_icd10 %>%
  mutate(Expanded_codes = Expanded_codes,
         Code_range_detailed = paste0(Code_range_detailed, " (", 
                                      Code_detailed, "; ", 
                                      Cause_detailed, ")"), 
         Code_range_broad = paste0(Code_range_broader, " (", 
                                      Code_broader, "; ", 
                                      Cause_broader, ")"), ) %>%
  select(Expanded_codes, Code_range_detailed, Code_range_broad)

kable(head(example, 5)) %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(1, width = "5%") %>% 
  column_spec(2, width = "45%") %>%  
  column_spec(3, width = "50%")
Expanded_codes Code_range_detailed Code_range_broad
A00 A00 (1002; Cholera) A00-B99 (1001; Certain infectious and parasitic diseases)
A01 A01-A08 (1004; Other intestinal infectious diseases) A00-B99 (1001; Certain infectious and parasitic diseases)
A02 A01-A08 (1004; Other intestinal infectious diseases) A00-B99 (1001; Certain infectious and parasitic diseases)
A03 A01-A08 (1004; Other intestinal infectious diseases) A00-B99 (1001; Certain infectious and parasitic diseases)
A04 A01-A08 (1004; Other intestinal infectious diseases) A00-B99 (1001; Certain infectious and parasitic diseases)
Show code
rm(manual_1025, manual_1046, manual_1103, split_mat, df_icd10_long, summary_dup, example)

Combining Mortality with ICD-10 Code

To merge the mortality data set with the ICD mapping table, I first split the mortality data into two subsets: one containing four-digit WHO cause codes (e.g., 1001) and another containing ICD-10 alphanumeric codes (e.g., A00). Each subset was then merged with the corresponding portion of the ICD mapping table using the appropriate key, and the results were combined to create a unified data set.

Show code
df <- df %>% mutate(cause_is_4digit = str_detect(Cause, "^[0-9][0-9][0-9][0-9]$"))

df_broad_icd <- df %>%
  filter(cause_is_4digit) %>%
  left_join(
    df_icd10 %>%
      select(-Expanded_codes) %>% 
      distinct(Code_detailed, .keep_all = T),
    by = c("Cause" = "Code_detailed")) %>% 
  mutate(Code_detailed = Cause) %>%
  select(c(
  "Country", "Year", "Cause", "Sex", "Age_group", "Death_count", "avail", "cause_is_4digit", 
  "Code_detailed", "Code_broader", "Code_range_detailed", "Code_range_broader",
  "Cause_detailed", "Cause_broader"))

df_detailed_icd <- df %>%
  filter(!cause_is_4digit) %>%
  left_join(
    df_icd10,
    by = c("Cause" = "Expanded_codes")
  )

df <- rbind(df_detailed_icd, df_broad_icd) %>%
  select(-c(cause_is_4digit)) %>% 
  rename(Code = Cause) %>% 
  select(c(Country, Year, Sex, Age_group, Death_count, avail, 
           Code, Cause_detailed, Cause_broader,
           Code_detailed, Code_range_detailed, 
           Code_broader, Code_range_broader))

kable(head(df, 2))
Country Year Sex Age_group Death_count avail Code Cause_detailed Cause_broader Code_detailed Code_range_detailed Code_broader Code_range_broader
Egypt 2014 1 Deaths1 21 1 A01 Other intestinal infectious diseases Certain infectious and parasitic diseases 1004 A01-A08 1001 A00-B99
Egypt 2014 1 Deaths2 5 1 A01 Other intestinal infectious diseases Certain infectious and parasitic diseases 1004 A01-A08 1001 A00-B99
Show code
rm(df_broad_icd, df_detailed_icd, df_icd10)

After incorporating the ICD-10 tabulation list, the data set now contains 106 detailed causes of death and 13 broad cause categories:

Show code
count_sex <- table(df$Sex, useNA = "ifany")
kable(t(data.frame(
  Country = length(unique(df$Country)),
  Detailed_Cause_of_Death = length(unique(df$Cause_detailed)),
  Broad_Cause_of_Death = length(unique(df$Cause_broader)),
  Years   = paste(sort(unique(df$Year)), collapse = ", "),
  Sex     = paste0("male: ",   
                   100 * round((count_sex["1"]) / nrow(df), 4), "%, ",
                   "female: ", 
                   100 *round((count_sex["2"]) / nrow(df), 4), "%, ",
                   "unknown ", 
                   100 * round((count_sex["9"]) / nrow(df), 4), "%"),
  stringsAsFactors = FALSE)), 
  align = c("l", "l"))
Country 134
Detailed_Cause_of_Death 107
Broad_Cause_of_Death 13
Years 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023
Sex male: 52.09%, female: 47.47%, unknown 0.43%
Show code
rm(count_sex)